* Reset settings and initialize log file
launch, path("share/sorting")

*-------------------------------------------------------------------------------
* Price and Wasserman (2024), "The Summer Drop in Female Employment"
*
* Description: Explore sectoral sorting and associated earnings premia/penalties
*              in occupations represented in both the ed- and non-ed sectors.
*-------------------------------------------------------------------------------


* Determine the nadir of the minimum wage for use in sample selection
*-------------------------------------------------------------------------------

* Load the federal minimum wage
freduse PCEPI FEDMINNFRWG, clear
rename FEDMINNFRWG minwage

* Format date
gen int year = real(substr(date, 1, 4))
gen byte month = real(substr(date, 6, 2))
gen int tm = ym(year, month)
format tm %tm

* Restrict to our analysis period (including the previous year)
keep if inrange(year, 1988, 2019)

* Merge in the PCE price deflator
merge 1:1 tm using "$basepath/data/derived/pce.dta", assert(3) nogenerate

* Identify the lowest value of the real minimum wage
replace minwage = minwage/pce
sum minwage
local nadir = r(min)


* Prepare data and run models
*-------------------------------------------------------------------------------

if "$estimate" != "0" {

	* Identify occupations with adequate representation in/outside of education
	*---------------------------------------------------------------------------

	* Load data on adult individuals
	gzuse "$basepath/data/derived/cps_bms_sample.dta.gz", clear

	* Restrict to CPS monthly observations with known occupations
	keep if emp == 1 & !missing(occ1990_2d)

	* Record the number of distinct year-months
	gdistinct tm
	local periods = r(ndistinct)

	* Aggregate to occupation-school-sex level (pooling across time periods)
	gcollapse (sum) emp [pw = wtfinl], by(occ1990_2d school female)

	* Populate missing cells
	fillin occ1990_2d school female
	replace emp = 0 if _fillin == 1
	drop _fillin

	* Compute average employment over all months in our sample period
	replace emp = emp/`periods'

	* Compute the female share of employment in each occupation-sector cell
	reshape wide emp, i(occ1990_2d school) j(female)
	gen femsh = emp1/(emp0 + emp1)

	* Retain total employment and the female share of employment
	gen emp = emp0 + emp1
	drop emp0 emp1

	* Switch to one observation per occupation
	reshape wide emp femsh, i(occ1990_2d) j(school)
	order occ1990_2d emp* femsh*

	* Rename variables
	rename emp0 emp_noned
	rename emp1 emp_ed
	rename femsh0 femsh_noned
	rename femsh1 femsh_ed

	* Specify the threshold used to define occupations with enough/too few workers
	local threshold = 20000

	* List occupations with school-sector employment above the threshold
	tab occ1990_2d if emp_ed >= `threshold'

	* List occupations with both school and non-school emp above the threshold
	tab occ1990_2d if emp_ed >= `threshold' & emp_noned >= `threshold'

	* List occupations that exceed the school threshold but not the non-school
	tab occ1990_2d if emp_ed >= `threshold' & emp_noned < `threshold'

	* Tag occupations pass both thresholds
	gen byte overlapping = (emp_ed >= `threshold' & emp_noned >= `threshold')

	* Shorten and simplify occupation names
	label define occ1990_2d_lbl 00 "Exec., admin., & managerial", modify
	label define occ1990_2d_lbl 01 "Management related", modify
	label define occ1990_2d_lbl 02 "Engineers, architects, and surveyors", modify
	label define occ1990_2d_lbl 03 "Math and computer scientists", modify
	label define occ1990_2d_lbl 04 "Natural scientists", modify
	label define occ1990_2d_lbl 05 "Health diagnosing", modify
	label define occ1990_2d_lbl 06 "Health assess. & treating", modify
	label define occ1990_2d_lbl 07 "Therapists", modify
	label define occ1990_2d_lbl 09 "Teachers, except postsecondary", modify
	label define occ1990_2d_lbl 10 "Librarians & curators", modify
	label define occ1990_2d_lbl 11 "Social sci. & urban planners", modify
	label define occ1990_2d_lbl 12 "Social, rec., & relig. workers", modify
	label define occ1990_2d_lbl 14 "Writers, artists, & athletes", modify
	label define occ1990_2d_lbl 16 "Technologists, no health", modify
	label define occ1990_2d_lbl 17 "Technicians, no health/eng./sci.", modify
	label define occ1990_2d_lbl 19 "Sales reps., commodities", modify
	label define occ1990_2d_lbl 21 "Supervisors, admin. support", modify
	label define occ1990_2d_lbl 22 "Secretaries, stenographers", modify
	label define occ1990_2d_lbl 23 "Information clerks", modify
	label define occ1990_2d_lbl 24 "Records processing, no financial", modify
	label define occ1990_2d_lbl 25 "Financial records processing", modify
	label define occ1990_2d_lbl 26 "Office machine operators", modify
	label define occ1990_2d_lbl 30 "Adjusters and investigators", modify
	label define occ1990_2d_lbl 31 "Misc. admin. support", modify
	label define occ1990_2d_lbl 34 "Guards", modify
	label define occ1990_2d_lbl 35 "Food prep. and service", modify
	label define occ1990_2d_lbl 37 "Cleaning/building service, no hh", modify
	label define occ1990_2d_lbl 38 "Personal service", modify
	label define occ1990_2d_lbl 41 "Related agricultural", modify
	label define occ1990_2d_lbl 46 "Elect. equipment repairers", modify
	label define occ1990_2d_lbl 49 "Construction, no supervisors", modify
	label define occ1990_2d_lbl 70 "Motor vehicle operators", modify

	* Label variables
	label variable overlapping "Sufficient employment both in and outside of education sector"
	label variable emp_noned "Average employment outside of education sector"
	label variable emp_ed "Average employment in education sector"
	label variable femsh_noned "Female share of employment outside of education sector"
	label variable femsh_ed "Female share of employment in education sector"

	* Save information about these occupations
	order occ1990_2d overlapping emp_* femsh_*
	compress
	save "$basepath/models/sorting/overlapping_occs.dta", replace


	* Compute the education-sector earnings penalty/premium in each occupation
	*---------------------------------------------------------------------------

	* Load data on March basic monthly respondents matched to ASEC
	gzuse "$basepath/data/derived/cps_asec.dta.gz", clear
	keep pid hid tm year marbasecidp asecwt female educ age occ90ly_2d ind90ly incwage wkswork1 uhrsworkly

	* Retain male March ASEC respondents with a defined civilian occupation last year
	keep if female == 0 & !missing(occ90ly_2d) & occ90ly_2d != 76

	* Tag those who worked in the education sector last year
	gen byte schoolly = inlist(ind90ly, 842, 850, 851, 852, 860)

	* Trim low earnings: nadir of minimum wage, 10 hours/week, 20 weeks/year
	drop if incwage < (`nadir' * 10 * 20)

	* Log annual earnings
	gen ln_earnings = ln(incwage)

	* Decompose log earnings into weeks, hours/week, and hourly wage
	gen ln_weeks = ln(wkswork1)
	gen ln_hours = ln(uhrsworkly)
	gen ln_wage = ln_earnings - ln_weeks - ln_hours

	* Retain variables we need
	keep pid hid tm year asecwt educ age ln_* occ90ly_2d schoolly

	* Align variable names with those used earlier
	rename occ90ly_2d occ1990_2d
	rename schoolly school

	* Estimate occupation-specific log earnings penalties/premia
	foreach v in "earnings" "weeks" "hours" "wage" {
		reg ln_`v' i.occ1990_2d i.occ1990_2d#school i.educ c.age c.age#c.age i.year [aw = asecwt], vce(cluster hid)
		estimates save "$basepath/models/sorting/ed_penalty_`v'.ster", replace
	}


	* Measure the part-time share of employment in each job type
	*---------------------------------------------------------------------------

	* Load data on adult individuals
	gzuse "$basepath/data/derived/cps_bms_sample.dta.gz", clear

	* Restrict to employed men outside the summer months
	keep if female == 0 & emp == 1 & !inrange(month, 6, 8)

	* Record job characteristics by occupation x sector
	gcollapse (mean) ptemp [pw = wtfinl], by(occ1990_2d school)

	* Express part-time employment in percent terms
	replace ptemp = 100 * ptemp

	* Reshape
	reshape wide ptemp, i(occ1990_2d) j(school)
	rename *0 *_noned
	rename *1 *_ed

	* Save the part-time shares
	order occ1990_2d ptemp*
	compress
	save "$basepath/models/sorting/flexibility.dta", replace
}


* Combine information about occupational attributes
*-------------------------------------------------------------------------------

* Prepare school-sector earnings premia
foreach v in "earnings" "weeks" "hours" "wage" {
	* Load estimates
	estimates use "$basepath/models/sorting/ed_penalty_`v'.ster"
	tempfile premia
	regsave using `premia'
	use `premia', clear

	* Extract occupation, point estimate, and SE
	gen byte occ1990_2d = real(regexs(1)) if regexm(var, "([0-9]*)b?\.occ1990_2d#1.school")
	keep if !missing(occ1990_2d)
	rename coef `v'_penalty_beta
	rename stderr `v'_penalty_stde
	keep occ1990_2d `v'_penalty_*
	order occ1990_2d `v'_penalty_*
	tempfile premia_`v'
	save `premia_`v''
}

* Combine earnings premia and their subcomponents
use `premia_earnings', clear
merge 1:1 occ1990_2d using `premia_weeks', assert(3) nogenerate
merge 1:1 occ1990_2d using `premia_hours', assert(3) nogenerate
merge 1:1 occ1990_2d using `premia_wage', assert(3) nogenerate

* Verify additivity
assert abs(earnings_penalty_beta - (weeks_penalty_beta + hours_penalty_beta + wage_penalty_beta)) < .0001

* Merge with the list of occupational employment and female shares
merge 1:1 occ1990_2d using "$basepath/models/sorting/overlapping_occs.dta", assert(2 3) update nogenerate keepusing(overlapping femsh_*)
assert !missing(earnings_penalty_beta) if overlapping == 1

* Merge in the part-time share in each occupation
merge 1:1 occ1990_2d using "$basepath/models/sorting/flexibility.dta", assert(3) nogenerate

* Retain overlapping occupations
keep if overlapping == 1
drop overlapping
order occ1990_2d femsh_* *_penalty_* ptemp_*


* Plot occupational characteristics within and outside of education
*-------------------------------------------------------------------------------

foreach v in femsh ptemp {
	if "`v'" == "femsh" {
		local vlbl "Female share of employment"
		local col1 "lcolor(black) fcolor(gs6) lwidth(vthin)"
		local col2 "lcolor(black) fcolor(gs14) lwidth(vthin)"
	}
	else if "`v'" == "ptemp" {
		local vlbl "Part-time share of employment"
		local col1 "fcolor(*1.2)"
		local col2 "fcolor(*0.5)"
	}

	capture drop sortvar
	gen sortvar = -`v'_noned

	#delimit ;
	graph hbar `v'_noned `v'_ed,
		over(occ1990_2d, label(labsize(*0.65)) sort(sortvar) gap(*1.5))
		ytitle("`vlbl'", size(medsmall))
		ylabel(, labsize(medsmall))
		bar(1, `col1')
		bar(2, `col2')
		legend(size(medsmall) label(1 "Non-education sector") label(2 "Education sector"));
	#delimit cr

	nicepdf "$basepath/output/sorting_char_`v'.pdf", indirect replace

	* Report the number of occupations in which the characteristic is higher in education jobs
	quietly count if `v'_ed >= `v'_noned
	display "`v': higher in education for `=r(N)' occupations"
}


* Plot the earnings premium/penalty for working in the education sector
*-------------------------------------------------------------------------------

* Plot both overall premia and wage premia
foreach v in "earnings" "wage" {
	if "`v'" == "earnings" {
		local col "black"
	}
	else if "`v'" == "wage" {
		local col "$col1"
	}

	* Sort in descending order of premium
	sort `v'_penalty_beta
	capture drop n_coef_sort
	gen n_coef_sort = _n
	labmask n_coef_sort, values(occ1990_2d) decode

	* Compute confidence intervals
	capture drop beta_low beta_upp
	gen beta_low = `v'_penalty_beta - 1.96 * `v'_penalty_stde
	gen beta_upp = `v'_penalty_beta + 1.96 * `v'_penalty_stde

	* Plot estimates
	#delimit ;
	twoway
		(scatter n_coef_sort `v'_penalty_beta, mcolor(`col') msize(small) ylabel(1(1)`=_N', valuelabel labsize(small)))
		(rcap beta_upp beta_low n_coef_sort, horizontal lcolor(`col')),
		xtitle("Log `v' penalty/premium in the education sector", size(medsmall))
		xscale(range(-1 0.5))
		xlabel(-1(0.25)0.5, labsize(medsmall))
		xline(0)
		xsize(6.5)
		ytitle("")
		legend(off);
	#delimit cr

	nicepdf "$basepath/output/sorting_penalties_`v'.pdf", indirect replace
}

* Close the log file
unlaunch
